#!pip install praw
Collecting praw
Downloading praw-7.6.1-py3-none-any.whl (188 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188.8/188.8 KB 2.0 MB/s eta 0:00:0000:0100:01
Collecting prawcore<3,>=2.1
Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Requirement already satisfied: websocket-client>=0.54.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from praw) (1.4.2)
Collecting update-checker>=0.18
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Requirement already satisfied: requests<3.0,>=2.6.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from prawcore<3,>=2.1->praw) (2.28.1)
Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (1.26.13)
Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.1.1)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0
import praw
import pandas as pd
reddit_read_only = praw.Reddit(client_id="-FkFx07VGHhRLGZW9CNuRw", # your client id
client_secret="LP6ZvPq-t4OmaNm8xtIjkxHLRC7N0A", # your client secret
user_agent="MK scraper") # your user agent
subrdit = reddit_read_only.subreddit("AmITheAsshole")
# Display the name of the Subreddit
#print("Display Name:", subrdit.display_name)
# Display the description of the Subreddit
#print("Description:", subrdit.description)
subreddit = reddit_read_only.subreddit("AmITheAsshole")
for post in subreddit.top(limit=5):
print(post.title)
print()
AITA for telling my wife the lock on my daughter's door does not get removed til my brother inlaw and his daughters are out of our house? META: This sub is moving towards a value system that frequently doesn't align with the rest of the world UPDATE, AITA for despising my mentally handicap sister? AITA For suing my girlfriend after she had my 1967 impala project taken to the scrapyard? AITA for bringing my SIL’s wallet to the restaurant when she conveniently always forgets it?
posts = subreddit.top("year", limit = 800)
posts_dict = {'title' : [], 'body': [], 'score': [], 'id': [], 'top_comment_body' : [], 'top_comment_score': [], 'url': []}
i=0
for post in posts:
# Title of each post
posts_dict["title"].append(post.title)
# Text inside a post
posts_dict["body"].append(post.selftext)
# Unique ID of each post
posts_dict["id"].append(post.id)
# The score of a post
posts_dict["score"].append(post.score)
# Text inside the top comment of the post
posts_dict["top_comment_body"].append(post.comments[1].body)
# Score of the top comment of the post
posts_dict["top_comment_score"].append(post.comments[1].score)
# URL of each post
posts_dict["url"].append(post.url)
if i%10 == 0:
print("Done with post number ", i)
i += 1
# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts
Done with post number 0 Done with post number 10 Done with post number 20 Done with post number 30 Done with post number 40 Done with post number 50 Done with post number 60 Done with post number 70 Done with post number 80 Done with post number 90 Done with post number 100 Done with post number 110 Done with post number 120 Done with post number 130 Done with post number 140 Done with post number 150 Done with post number 160 Done with post number 170 Done with post number 180 Done with post number 190 Done with post number 200 Done with post number 210 Done with post number 220 Done with post number 230 Done with post number 240 Done with post number 250 Done with post number 260 Done with post number 270 Done with post number 280 Done with post number 290 Done with post number 300 Done with post number 310 Done with post number 320 Done with post number 330 Done with post number 340 Done with post number 350 Done with post number 360 Done with post number 370 Done with post number 380 Done with post number 390 Done with post number 400 Done with post number 410 Done with post number 420 Done with post number 430 Done with post number 440 Done with post number 450 Done with post number 460 Done with post number 470 Done with post number 480 Done with post number 490 Done with post number 500 Done with post number 510 Done with post number 520 Done with post number 530 Done with post number 540 Done with post number 550 Done with post number 560 Done with post number 570 Done with post number 580 Done with post number 590 Done with post number 600 Done with post number 610 Done with post number 620 Done with post number 630 Done with post number 640 Done with post number 650 Done with post number 660 Done with post number 670 Done with post number 680 Done with post number 690 Done with post number 700 Done with post number 710 Done with post number 720 Done with post number 730 Done with post number 740 Done with post number 750 Done with post number 760 Done with post number 770 Done with post number 780 Done with post number 790
| title | body | score | id | top_comment_body | top_comment_score | url | |
|---|---|---|---|---|---|---|---|
| 0 | AITA for bringing my SIL’s wallet to the resta... | Edit: update on profile\n\nMy (f28) SIL “Amy” ... | 68512 | x2k5kv | NTA. Stone cold busted. Next time she books an... | 1442 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 1 | AITA for bringing up my brother's "premature" ... | I am a nurse practitioner and I am the primary... | 56259 | zvmflw | You can tell the family about the time you wer... | 678 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 2 | AITA for not taking down my video that was a g... | I have a sister that’s 6 years older than me. ... | 54743 | wyjbjs | NTA\n\nMy parents missed my wedding too all be... | 1578 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 3 | UPDATE AITA for walking out of the Airport whe... | Hello!.\n\n\nI don't know where to begin...it'... | 51464 | ur2l3s | I'm sorry you are going through this, but I'm ... | 18671 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 4 | AITA for walking out of the Airport when I saw... | \n\nI F30 don't have the best relationship wit... | 50024 | unhse2 | Definitely NTA. You know that if you had sucke... | 9416 | https://www.reddit.com/r/AmItheAsshole/comment... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 795 | AITA for saying I will never host another fami... | We had a family get together for Father’s Day.... | 14260 | vge8ha | NTA but a 45 year old man licked 6 slices of p... | 12475 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 796 | AITA for bothering a woman at home? | My son (4) had a sleepover last night with a f... | 14266 | 10aeoi3 | YTA.\n\nText - no response\n\nCall - no respon... | 3533 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 797 | AITA for having a "scary" kitty around children? | I (m27) have a cat named stargazer. I adopted... | 14251 | tiq83r | Guess they just lost their free babysitter the... | 5976 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 798 | AITA for sticking to my(29M) guns when it came... | When my wife(31F) and I(29M) were trying to ge... | 14227 | uvoj8s | “Out of left field and creepy” is exactly righ... | 1 | https://www.reddit.com/r/AmItheAsshole/comment... |
| 799 | AITA for not telling my fiancé where I go on S... | So super weird situation and need a judgement.... | 14219 | sexfa3 | Girl wants you back. Who would bring a charger... | 82 | https://www.reddit.com/r/AmItheAsshole/comment... |
800 rows × 7 columns
top_posts.to_csv('TM_project/reddit_posts.csv', index=False)
import pickle # for loading (and saving) the previously web scraped data
import pandas as pd # for processing data in dataframes
import matplotlib.pyplot as plt # for plotting
import re # for cleaning textual data (uses regular expressions ouch!)
from collections import Counter # for counting tokens occurences
import math # for calculations
import nltk
from nltk.tokenize import word_tokenize # for tokenization
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords
# import stop_words # source: https://pypi.org/project/stop-words/#installation
# from stop_words import get_stop_words # alternative stopwords list
import gensim
from gensim import corpora # for: Dictionary(), word2bow()
from gensim import models # for: TfidfModel()
import statistics # for: quantiles()
import numpy as np # for some maths
import time # for measuring time of computation
def save_object(obj, filename):
with open(filename, 'wb') as output: # overwrites any existing file.
pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
#removing \n
post_texts = top_posts["body"].map(lambda x: re.sub('\n', ' ', x))
#removing all numbers
post_texts = post_texts.map(lambda x: re.sub(r'[0-9]+', '', x))
#removing ,\!?/:;''()``’“-”—#
post_texts = post_texts.map(lambda x: re.sub("[,\!?/:;''()``’“-”—#]", '', x))
#removing .
post_texts = post_texts.map(lambda x: re.sub(r"([.]+)", '', x))
#all letters to lover case
post_texts = post_texts.map(lambda x: x.lower())
#removing one-letter words
post_texts = post_texts.map(lambda x: re.sub(r'\b\w\b', '', x))
post_texts
0 edit update on profile my sil amy always co...
1 am nurse practitioner and am the primary ca...
2 have sister thats years older than me my ...
3 hello dont know where to beginits been an a...
4 dont have the best relationship with my hu...
...
795 we had family get together for fathers day my...
796 my son had sleepover last night with friend...
797 have cat named stargazer adopted her arou...
798 when my wifef and im were trying to get pregna...
799 so super weird situation and need judgement ...
Name: body, Length: 800, dtype: object
#word_tokenize() applied to every single text
for i in range(0,len(post_texts)):
post_texts[i] = word_tokenize(post_texts[i])
post_texts
0 [edit, update, on, profile, my, sil, amy, alwa...
1 [am, nurse, practitioner, and, am, the, primar...
2 [have, sister, thats, years, older, than, me, ...
3 [hello, dont, know, where, to, beginits, been,...
4 [dont, have, the, best, relationship, with, my...
...
795 [we, had, family, get, together, for, fathers,...
796 [my, son, had, sleepover, last, night, with, f...
797 [have, cat, named, stargazer, adopted, her, ar...
798 [when, my, wifef, and, im, were, trying, to, g...
799 [so, super, weird, situation, and, need, judge...
Name: body, Length: 800, dtype: object
ps = PorterStemmer()
for i in range(0,len(post_texts)):
words = []
for word in post_texts[i]:
words.append(ps.stem(word)) #stems every token in document and append it to a list
#it takes few minutes
post_texts
0 [edit, update, on, profile, my, sil, amy, alwa...
1 [am, nurse, practitioner, and, am, the, primar...
2 [have, sister, thats, years, older, than, me, ...
3 [hello, dont, know, where, to, beginits, been,...
4 [dont, have, the, best, relationship, with, my...
...
795 [we, had, family, get, together, for, fathers,...
796 [my, son, had, sleepover, last, night, with, f...
797 [have, cat, named, stargazer, adopted, her, ar...
798 [when, my, wifef, and, im, were, trying, to, g...
799 [so, super, weird, situation, and, need, judge...
Name: body, Length: 800, dtype: object
stop_words = nltk.corpus.stopwords.words('english') #one of stopwords dictionaries available in Python
# cleaning stopwords
stop_words = pd.Series(stop_words).map(lambda x: re.sub('\n', '', x))
stop_words = stop_words.map(lambda x: re.sub("[,\!?/:;''()``]", '', x))
stop_words = stop_words.map(lambda x: re.sub(r"([.]+)", '', x))
# stemming stopwords
ps = PorterStemmer()
for i in range(0,len(stop_words)):
stop_words[i] = ps.stem(stop_words[i])
#making stopwords back a list
stop_words = list(stop_words)
#adding some specific stopwords
stop_words.append('``')
stop_words.append("\'\'")
# removing stopwords from post texts
for i in range(0,len(post_texts)):
post_texts[i] = [word for word in post_texts[i] if not word in list(stop_words)]
post_texts
0 [edit, update, profile, sil, amy, always, come...
1 [nurse, practitioner, primary, care, provider,...
2 [sister, thats, years, older, parents, years, ...
3 [hello, know, beginits, absolute, nightmare, r...
4 [best, relationship, husbands, mom, since, day...
...
795 [family, get, together, fathers, day, older, b...
796 [son, sleepover, last, night, friend, this, fr...
797 [cat, named, stargazer, adopted, around, years...
798 [wifef, im, trying, get, pregnant, deal, boy, ...
799 [super, weird, situation, need, judgement, eng...
Name: body, Length: 800, dtype: object
top_posts["body_clean"] = post_texts
top_posts.head()
| title | body | score | id | top_comment_body | top_comment_score | url | body_clean | |
|---|---|---|---|---|---|---|---|---|
| 0 | AITA for bringing my SIL’s wallet to the resta... | Edit: update on profile\n\nMy (f28) SIL “Amy” ... | 68512 | x2k5kv | NTA. Stone cold busted. Next time she books an... | 1442 | https://www.reddit.com/r/AmItheAsshole/comment... | [edit, update, profile, sil, amy, always, come... |
| 1 | AITA for bringing up my brother's "premature" ... | I am a nurse practitioner and I am the primary... | 56259 | zvmflw | You can tell the family about the time you wer... | 678 | https://www.reddit.com/r/AmItheAsshole/comment... | [nurse, practitioner, primary, care, provider,... |
| 2 | AITA for not taking down my video that was a g... | I have a sister that’s 6 years older than me. ... | 54743 | wyjbjs | NTA\n\nMy parents missed my wedding too all be... | 1578 | https://www.reddit.com/r/AmItheAsshole/comment... | [sister, thats, years, older, parents, years, ... |
| 3 | UPDATE AITA for walking out of the Airport whe... | Hello!.\n\n\nI don't know where to begin...it'... | 51464 | ur2l3s | I'm sorry you are going through this, but I'm ... | 18671 | https://www.reddit.com/r/AmItheAsshole/comment... | [hello, know, beginits, absolute, nightmare, r... |
| 4 | AITA for walking out of the Airport when I saw... | \n\nI F30 don't have the best relationship wit... | 50024 | unhse2 | Definitely NTA. You know that if you had sucke... | 9416 | https://www.reddit.com/r/AmItheAsshole/comment... | [best, relationship, husbands, mom, since, day... |
def generate_ngrams(text, ngram = 1):
temp = zip(*[text[i:] for i in range(0,ngram)]) # set with pairs, three, ..., ns of tokens
ans = [' '.join(ngram) for ngram in temp] # joins the elements in strings
ans = pd.Series(ans).map(lambda x: re.sub(" ", '_', x)) # replaces spaces with '_'
return list(ans)
for i in range(0,len(post_texts)):
unigrams = post_texts[i]
bigrams = generate_ngrams(post_texts[i], ngram = 2)
trigrams = generate_ngrams(post_texts[i], ngram = 3)
text = []
text.append(unigrams)
text.append(bigrams)
text.append(trigrams)
post_texts[i] = [item for sublist in text for item in sublist]
post_texts
0 [edit, update, profile, sil, amy, always, come...
1 [nurse, practitioner, primary, care, provider,...
2 [sister, thats, years, older, parents, years, ...
3 [hello, know, beginits, absolute, nightmare, r...
4 [best, relationship, husbands, mom, since, day...
...
795 [family, get, together, fathers, day, older, b...
796 [son, sleepover, last, night, friend, this, fr...
797 [cat, named, stargazer, adopted, around, years...
798 [wifef, im, trying, get, pregnant, deal, boy, ...
799 [super, weird, situation, need, judgement, eng...
Name: body, Length: 800, dtype: object
#dictionary from gensim library = keys are: 1, 2, 3, ..., number of tokens; values are tokens' names
dictionary = corpora.Dictionary(post_texts)
#corpus from gensim library consists of so called bows
#every bow = keys are tokens' indexes; values are numbers of tokens' occurences in text
corpus = [dictionary.doc2bow(text) for text in post_texts]
tfidf_model = models.TfidfModel(corpus, id2word = dictionary)
def TFIDF(dictionary, corpus, which_text, tfidf_model):
bow = corpus[which_text]
tfidfdictionary = dict(tfidf_model[bow]) #TFIDF for tokens in a chosen text
#below: keys are tokens' names; values are numbers of tokens' occurences in text
TFIDFdictionary = dict((dictionary[key], value) for (key, value) in tfidfdictionary.items())
return(TFIDFdictionary)
TFIDF(dictionary, corpus, 0, tfidf_model)
{'*': 0.02909107242391753,
'*_make': 0.05215345646722317,
'*_make_fair': 0.05215345646722317,
'*_specifically': 0.05215345646722317,
'*_specifically_*': 0.05215345646722317,
'admit': 0.022992093374259996,
'admit_got': 0.04674551016017217,
'admit_got_this': 0.05215345646722317,
'aita': 0.004934951957707041,
'aita_taking': 0.04674551016017217,
'aita_taking_wallet': 0.05215345646722317,
'always': 0.027602293438124433,
'always_comes': 0.04674551016017217,
'always_comes_visit': 0.05215345646722317,
'always_conveniently': 0.05215345646722317,
'always_conveniently_forgets': 0.05215345646722317,
'always_wants': 0.041337563853121165,
'always_wants_go': 0.05215345646722317,
'amount': 0.024414569317916215,
'amount_money': 0.039596594001239084,
'amount_money_much': 0.05215345646722317,
'amy': 0.11452235417274861,
'amy_always': 0.05215345646722317,
'amy_always_comes': 0.05215345646722317,
'amy_called': 0.05215345646722317,
'amy_called_saw': 0.05215345646722317,
'amy_hopefully': 0.05215345646722317,
'amy_hopefully_reading': 0.05215345646722317,
'asked': 0.006721951543800229,
'asked_pay': 0.039596594001239084,
'asked_pay_back': 0.05215345646722317,
'asked_separate': 0.05215345646722317,
'asked_separate_bills': 0.05215345646722317,
'asshole': 0.014176868386002118,
'asshole_ill': 0.05215345646722317,
'asshole_ill_admit': 0.05215345646722317,
'awards': 0.034188647694188085,
'awards_jeez': 0.05215345646722317,
'awards_jeez_lol': 0.05215345646722317,
'back': 0.011012172367351048,
'back_inside': 0.034188647694188085,
'back_inside_found': 0.05215345646722317,
'back_never': 0.05215345646722317,
'back_never_has': 0.05215345646722317,
'badmouthing': 0.04674551016017217,
'badmouthing_internet': 0.05215345646722317,
'badmouthing_internet_honestly': 0.05215345646722317,
'because': 0.002270550677011268,
'because_forgot': 0.05215345646722317,
'because_forgot_wallet': 0.05215345646722317,
'before': 0.00719783178824592,
'before_left': 0.030048676889675205,
'before_left_made': 0.05215345646722317,
'bill': 0.0946904564305479,
'bill_asked': 0.05215345646722317,
'bill_asked_pay': 0.05215345646722317,
'bill_because': 0.04674551016017217,
'bill_because_forgot': 0.05215345646722317,
'bill_this': 0.04674551016017217,
'bill_this_might': 0.05215345646722317,
'bills': 0.025881756084899062,
'bills_said': 0.05215345646722317,
'bills_said_need': 0.05215345646722317,
'bringing': 0.022808508025950944,
'bringing_restaurant': 0.05215345646722317,
'bringing_restaurant_edit': 0.05215345646722317,
'call': 0.01150895528187117,
'called': 0.006607382417254549,
'called_saw': 0.05215345646722317,
'called_saw_this': 0.05215345646722317,
'cant': 0.018592991908553364,
'cant_keep': 0.0369714317839003,
'cant_keep_thank': 0.05215345646722317,
'cant_pay': 0.05215345646722317,
'cant_pay_share': 0.05215345646722317,
'car': 0.014801362977496785,
'car_pretended': 0.05215345646722317,
'car_pretended_forgot': 0.05215345646722317,
'care': 0.013767680076847257,
'care_amy': 0.05215345646722317,
'care_amy_hopefully': 0.05215345646722317,
'clear': 0.01911888418965894,
'clear_paying': 0.05215345646722317,
'clear_paying_bill': 0.05215345646722317,
'come': 0.009043776943138872,
'come_town': 0.05215345646722317,
'come_town_nonetheless': 0.05215345646722317,
'comes': 0.01786788805686598,
'comes_visit': 0.05215345646722317,
'comes_visit_town': 0.05215345646722317,
'comments': 0.02934480215481893,
'comments_cant': 0.04674551016017217,
'comments_cant_keep': 0.05215345646722317,
'comments_wake': 0.05215345646722317,
'comments_wake_call': 0.05215345646722317,
'conveniently': 0.05215345646722317,
'conveniently_forgets': 0.05215345646722317,
'conveniently_forgets_wallet': 0.05215345646722317,
'domes': 0.05215345646722317,
'domes_excuses': 0.05215345646722317,
'domes_excuses_why': 0.05215345646722317,
'done': 0.012556862465984078,
'done_eating': 0.05215345646722317,
'done_eating_asked': 0.05215345646722317,
'eating': 0.020473809777848063,
'eating_asked': 0.04674551016017217,
'eating_asked_separate': 0.05215345646722317,
'edit': 0.03465511850598997,
'edit_amy': 0.05215345646722317,
'edit_amy_called': 0.05215345646722317,
'edit_update': 0.0369714317839003,
'edit_update_profile': 0.05215345646722317,
'edit_wow': 0.043582064364633875,
'edit_wow_thanks': 0.05215345646722317,
'episode': 0.04674551016017217,
'episode_two': 0.05215345646722317,
'episode_two_half': 0.05215345646722317,
'every': 0.01176904661013826,
'every_time': 0.024873643918128827,
'every_time_come': 0.05215345646722317,
'everyone': 0.010661392172983898,
'everyone_something': 0.04674551016017217,
'everyone_something_say': 0.05215345646722317,
'excuses': 0.029180892558275585,
'excuses_why': 0.043582064364633875,
'excuses_why_cant': 0.05215345646722317,
'expensive': 0.04422907639921285,
'expensive_restaurant': 0.05215345646722317,
'expensive_restaurant_last': 0.05215345646722317,
'expensive_restaurants': 0.05215345646722317,
'expensive_restaurants_always': 0.05215345646722317,
'extremely': 0.04235434039577244,
'extremely_expensive': 0.05215345646722317,
'extremely_expensive_restaurant': 0.05215345646722317,
'extremely_furious': 0.05215345646722317,
'extremely_furious_said': 0.05215345646722317,
'fair': 0.022282329985197233,
'fair_amount': 0.04674551016017217,
'fair_amount_money': 0.05215345646722317,
'forgets': 0.04674551016017217,
'forgets_wallet': 0.05215345646722317,
'forgets_wallet_domes': 0.05215345646722317,
'forgot': 0.05471645088696174,
'forgot_something': 0.05215345646722317,
'forgot_something_went': 0.05215345646722317,
'forgot_wallet': 0.04674551016017217,
'forgot_wallet_reached': 0.05215345646722317,
'found': 0.010973514486475417,
'found_wallet': 0.05215345646722317,
'found_wallet_sitting': 0.05215345646722317,
'furious': 0.021325784544935195,
'furious_said': 0.039596594001239084,
'furious_said_touched': 0.05215345646722317,
'go': 0.004990220316228659,
'go_expensive': 0.05215345646722317,
'go_expensive_restaurants': 0.05215345646722317,
'got': 0.002617364098430153,
'got_this': 0.035929617546070165,
'got_this_move': 0.05215345646722317,
'grabbed': 0.025113724931968156,
'grabbed_wallet': 0.05215345646722317,
'grabbed_wallet_aita': 0.05215345646722317,
'half': 0.019465697611077828,
'half_men': 0.05215345646722317,
'half_men_leaving': 0.05215345646722317,
'has': 0.007178687603053892,
'has_implied': 0.05215345646722317,
'has_implied_since': 0.05215345646722317,
'has_made': 0.0369714317839003,
'has_made_reservation': 0.05215345646722317,
'honestly': 0.018062948649659606,
'honestly_care': 0.043582064364633875,
'honestly_care_amy': 0.05215345646722317,
'hopefully': 0.0315634854768493,
'hopefully_reading': 0.05215345646722317,
'hopefully_reading_comments': 0.05215345646722317,
'hotel': 0.025617255591598793,
'hotel_always': 0.05215345646722317,
'hotel_always_wants': 0.05215345646722317,
'husband': 0.014152225961455234,
'husband_pay': 0.04674551016017217,
'husband_pay_*': 0.05215345646722317,
'husband_went': 0.0369714317839003,
'husband_went_car': 0.05215345646722317,
'ill': 0.01354314609701713,
'ill_admit': 0.038174118057582876,
'ill_admit_got': 0.05215345646722317,
'implied': 0.034188647694188085,
'implied_since': 0.05215345646722317,
'implied_since_make': 0.05215345646722317,
'inside': 0.01958481469305312,
'inside_found': 0.05215345646722317,
'inside_found_wallet': 0.05215345646722317,
'instead': 0.013654605382874378,
'instead_hotel': 0.043582064364633875,
'instead_hotel_always': 0.05215345646722317,
'internet': 0.0315634854768493,
'internet_honestly': 0.05215345646722317,
'internet_honestly_care': 0.05215345646722317,
'jeez': 0.04674551016017217,
'jeez_lol': 0.05215345646722317,
'jeez_lol_many': 0.05215345646722317,
'keep': 0.012316781452144751,
'keep_thank': 0.05215345646722317,
'keep_thank_everyone': 0.05215345646722317,
'last': 0.008859468750773293,
'last_night': 0.022808508025950944,
'last_night_before': 0.05215345646722317,
'leaving': 0.018062948649659606,
'leaving_husband': 0.05215345646722317,
'leaving_husband_went': 0.05215345646722317,
'left': 0.0076524468185637075,
'left_made': 0.04674551016017217,
'left_made_clear': 0.05215345646722317,
'lol': 0.025617255591598793,
'lol_many': 0.05215345646722317,
'lol_many_comments': 0.05215345646722317,
'made': 0.013583018730921954,
'made_clear': 0.02960272595499357,
'made_clear_paying': 0.05215345646722317,
'made_reservation': 0.04674551016017217,
'made_reservation_extremely': 0.05215345646722317,
'make': 0.014995892754891205,
'make_fair': 0.05215345646722317,
'make_fair_amount': 0.05215345646722317,
'make_much': 0.038174118057582876,
'make_much_money': 0.043582064364633875,
'many': 0.014672401077409466,
'many_comments': 0.038174118057582876,
'many_comments_cant': 0.04674551016017217,
'men': 0.025881756084899062,
'men_leaving': 0.05215345646722317,
'men_leaving_husband': 0.05215345646722317,
'might': 0.016542332829378864,
'might_asshole': 0.04674551016017217,
'might_asshole_ill': 0.05215345646722317,
'money': 0.023103412337326646,
'money_much': 0.04674551016017217,
'money_much_treat': 0.05215345646722317,
'money_one': 0.05215345646722317,
'money_one_pay': 0.05215345646722317,
'move': 0.016542332829378864,
'move_straight': 0.05215345646722317,
'move_straight_episode': 0.05215345646722317,
'much': 0.015252966685256148,
'much_money': 0.033445036020718126,
'much_money_one': 0.05215345646722317,
'much_treat': 0.05215345646722317,
'much_treat_someone': 0.05215345646722317,
'need': 0.011637917181958491,
'need_one': 0.04674551016017217,
'need_one_bill': 0.05215345646722317,
'never': 0.00786332498863238,
'never_has': 0.05215345646722317,
'never_has_made': 0.05215345646722317,
'night': 0.011637917181958491,
'night_before': 0.034188647694188085,
'night_before_left': 0.05215345646722317,
'nonetheless': 0.043582064364633875,
'nonetheless_past': 0.05215345646722317,
'nonetheless_past_paid': 0.05215345646722317,
'one': 0.008499941713390111,
'one_bill': 0.05215345646722317,
'one_bill_because': 0.05215345646722317,
'one_pay': 0.05215345646722317,
'one_pay_husband': 0.05215345646722317,
'paid': 0.018786833340891568,
'paid_bill': 0.04674551016017217,
'paid_bill_asked': 0.05215345646722317,
'past': 0.01584317721532692,
'past_paid': 0.05215345646722317,
'past_paid_bill': 0.05215345646722317,
'pay': 0.05062000937043441,
'pay_*': 0.05215345646722317,
'pay_*_specifically': 0.05215345646722317,
'pay_back': 0.0321416771590372,
'pay_back_never': 0.05215345646722317,
'pay_husband': 0.05215345646722317,
'pay_husband_pay': 0.05215345646722317,
'pay_share': 0.05215345646722317,
'pay_share_has': 0.05215345646722317,
'paying': 0.01767758551716605,
'paying_bill': 0.05215345646722317,
'paying_bill_this': 0.05215345646722317,
'post': 0.017491814423691122,
'post_yelled': 0.05215345646722317,
'post_yelled_badmouthing': 0.05215345646722317,
'pretended': 0.035929617546070165,
'pretended_forgot': 0.05215345646722317,
'pretended_forgot_something': 0.05215345646722317,
'profile': 0.03276617175053187,
'profile_sil': 0.05215345646722317,
'profile_sil_amy': 0.05215345646722317,
'purse': 0.07002134452408915,
'purse_said': 0.043582064364633875,
'purse_said_this': 0.05215345646722317,
'purse_went': 0.04674551016017217,
'purse_went_restaurant': 0.05215345646722317,
'put': 0.010435230908275916,
'put_purse': 0.05215345646722317,
'put_purse_went': 0.05215345646722317,
'reached': 0.024194779647942567,
'reached_purse': 0.05215345646722317,
'reached_purse_said': 0.05215345646722317,
'reading': 0.026733730851986198,
'reading_comments': 0.035929617546070165,
'reading_comments_wake': 0.05215345646722317,
'reservation': 0.038174118057582876,
'reservation_extremely': 0.05215345646722317,
'reservation_extremely_expensive': 0.05215345646722317,
'restaurant': 0.06585083740928958,
'restaurant_done': 0.05215345646722317,
'restaurant_done_eating': 0.05215345646722317,
'restaurant_edit': 0.05215345646722317,
'restaurant_edit_wow': 0.05215345646722317,
'restaurant_last': 0.05215345646722317,
'restaurant_last_night': 0.05215345646722317,
'restaurants': 0.041337563853121165,
'restaurants_always': 0.05215345646722317,
'restaurants_always_conveniently': 0.05215345646722317,
'right': 0.009491556547070306,
'right_top': 0.04674551016017217,
'right_top_suitcase': 0.05215345646722317,
'said': 0.0036324528325895316,
'said_need': 0.034188647694188085,
'said_need_one': 0.05215345646722317,
'said_this': 0.02163178522820401,
'said_this_wallet': 0.05215345646722317,
'said_touched': 0.05215345646722317,
'said_touched_grabbed': 0.05215345646722317,
'saw': 0.012176200760157992,
'saw_this': 0.038174118057582876,
'saw_this_post': 0.05215345646722317,
'say': 0.008135199789966125,
'say_edit': 0.04674551016017217,
'say_edit_amy': 0.05215345646722317,
'separate': 0.027690276292248236,
'separate_bills': 0.05215345646722317,
'separate_bills_said': 0.05215345646722317,
'share': 0.02088817355473405,
'share_has': 0.05215345646722317,
'share_has_implied': 0.05215345646722317,
'sil': 0.023772946251224582,
'sil_amy': 0.05215345646722317,
'sil_amy_always': 0.05215345646722317,
'since': 0.004843699252915866,
'since_make': 0.04674551016017217,
'since_make_much': 0.05215345646722317,
'sitting': 0.019348371831445058,
'sitting_right': 0.05215345646722317,
'sitting_right_top': 0.05215345646722317,
'someone': 0.014420701271670697,
'someone_every': 0.05215345646722317,
'someone_every_time': 0.05215345646722317,
'something': 0.018025509929239392,
'something_say': 0.041337563853121165,
'something_say_edit': 0.05215345646722317,
'something_went': 0.04674551016017217,
'something_went_back': 0.05215345646722317,
'specifically': 0.030048676889675205,
'specifically_*': 0.05215345646722317,
'specifically_*_make': 0.05215345646722317,
'stays': 0.03052167123901916,
'stays_us': 0.04674551016017217,
'stays_us_instead': 0.05215345646722317,
'straight': 0.025113724931968156,
'straight_episode': 0.05215345646722317,
'straight_episode_two': 0.05215345646722317,
'suitcase': 0.043582064364633875,
'suitcase_put': 0.05215345646722317,
'suitcase_put_purse': 0.05215345646722317,
'taking': 0.014866652206352325,
'taking_wallet': 0.05215345646722317,
'taking_wallet_bringing': 0.05215345646722317,
'thank': 0.01679003672347147,
'thank_everyone': 0.025881756084899062,
'thank_everyone_something': 0.05215345646722317,
'thanks': 0.021789407100577425,
'thanks_awards': 0.05215345646722317,
'thanks_awards_jeez': 0.05215345646722317,
'this': 0.004752284754299111,
'this_might': 0.03276617175053187,
'this_might_asshole': 0.05215345646722317,
'this_move': 0.043582064364633875,
'this_move_straight': 0.05215345646722317,
'this_post': 0.02735822544348087,
'this_post_yelled': 0.05215345646722317,
'this_wallet': 0.05215345646722317,
'this_wallet_was': 0.05215345646722317,
'time': 0.0038885496472256796,
'time_come': 0.04674551016017217,
'time_come_town': 0.05215345646722317,
'top': 0.025617255591598793,
'top_suitcase': 0.05215345646722317,
'top_suitcase_put': 0.05215345646722317,
'touched': 0.038174118057582876,
'touched_grabbed': 0.05215345646722317,
'touched_grabbed_wallet': 0.05215345646722317,
'town': 0.048389559295885134,
'town_nonetheless': 0.05215345646722317,
'town_nonetheless_past': 0.05215345646722317,
'town_stays': 0.05215345646722317,
'town_stays_us': 0.05215345646722317,
'treat': 0.02735822544348087,
'treat_someone': 0.05215345646722317,
'treat_someone_every': 0.05215345646722317,
'two': 0.010933809478042688,
'two_half': 0.04674551016017217,
'two_half_men': 0.05215345646722317,
'update': 0.015695964974008842,
'update_profile': 0.043582064364633875,
'update_profile_sil': 0.05215345646722317,
'us': 0.006791509365460977,
'us_instead': 0.05215345646722317,
'us_instead_hotel': 0.05215345646722317,
'visit': 0.019465697611077828,
'visit_town': 0.05215345646722317,
'visit_town_stays': 0.05215345646722317,
'wake': 0.02960272595499357,
'wake_call': 0.04674551016017217,
'wallet': 0.26149238618780324,
'wallet_aita': 0.05215345646722317,
'wallet_aita_taking': 0.05215345646722317,
'wallet_bringing': 0.05215345646722317,
'wallet_bringing_restaurant': 0.05215345646722317,
'wallet_domes': 0.05215345646722317,
'wallet_domes_excuses': 0.05215345646722317,
'wallet_reached': 0.05215345646722317,
'wallet_reached_purse': 0.05215345646722317,
'wallet_sitting': 0.05215345646722317,
'wallet_sitting_right': 0.05215345646722317,
'wallet_was': 0.05215345646722317,
'wallet_was_extremely': 0.05215345646722317,
'wants': 0.014932492405676552,
'wants_go': 0.039596594001239084,
'wants_go_expensive': 0.05215345646722317,
'was': 0.00017755040764926922,
'was_extremely': 0.033445036020718126,
'was_extremely_furious': 0.05215345646722317,
'went': 0.014368366730440266,
'went_back': 0.025617255591598793,
'went_back_inside': 0.043582064364633875,
'went_car': 0.039596594001239084,
'went_car_pretended': 0.05215345646722317,
'went_restaurant': 0.041337563853121165,
'went_restaurant_done': 0.05215345646722317,
'why': 0.008107582000707988,
'why_cant': 0.035929617546070165,
'why_cant_pay': 0.05215345646722317,
'wow': 0.027039731535255008,
'wow_thanks': 0.05215345646722317,
'wow_thanks_awards': 0.05215345646722317,
'yelled': 0.016624030092291506,
'yelled_badmouthing': 0.05215345646722317,
'yelled_badmouthing_internet': 0.05215345646722317}
d_tfidf = {}
for i in range(0,len(corpus)): # for each text
data = TFIDF(dictionary, corpus, i, tfidf_model) # calculate TFIDF values for text's tokens
for token, value in data.items(): # next, for each token and its TFIDF value in text, prepare a dictionary
# with tokens' names as keys and list of TF-IDFs as values
d_tfidf.setdefault(token, []).append(value)
tfidf_values = [item for sublist in list(d_tfidf.values()) for item in sublist]
plt.hist(tfidf_values, bins=1000)
plt.xlabel('TF-IDF')
plt.ylabel('Number of tokens with certain TF-IDF value')
plt.xlim([0, 0.1])
plt.show()
for i in [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5]:
print('Quantile ',i*100,'%: ',np.quantile(tfidf_values,i),sep='')
Quantile 1.0%: 0.003735010551285883 Quantile 2.0%: 0.0052788350576095724 Quantile 3.0%: 0.006499689239911252 Quantile 4.0%: 0.0076187378989144745 Quantile 5.0%: 0.008691952582100246 Quantile 10.0%: 0.013784480315353988 Quantile 20.0%: 0.023034908582879814 Quantile 30.0%: 0.02915079373277821 Quantile 40.0%: 0.03479737109459152 Quantile 50.0%: 0.038053893410117355
import pickle # for saving objects
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px # for nice plotting
import warnings
import math
from nltk.tokenize import RegexpTokenizer # for LSA in sklearn, we will need additional tokenizer
from sklearn.feature_extraction.text import CountVectorizer # one can consider LSA with DF in DTM...
from sklearn.feature_extraction.text import TfidfVectorizer # or with TF-IDF values in DTM
from sklearn.decomposition import LatentDirichletAllocation # LDA implementation
def save_object(obj, filename):
with open(filename, 'wb') as output: # Overwrites any existing file.
pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
# as our preprocessed data is already tokenized
# therefore, we need to make them strings again...
def listToString(s):
str1 = ""
for ele in s:
str1 += ele+" "
return str1
top_posts["body_clean_str"] = top_posts["body_clean"] # new column, for now a copy of tokenized and preprocessed texts
for i in range(0,len(top_posts)):
top_posts["body_clean_str"][i] = listToString(top_posts["body_clean_str"][i])
top_posts.head()
| title | body | score | id | top_comment_body | top_comment_score | url | body_clean | body_clean_str | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AITA for bringing my SIL’s wallet to the resta... | Edit: update on profile\n\nMy (f28) SIL “Amy” ... | 68512 | x2k5kv | NTA. Stone cold busted. Next time she books an... | 1442 | https://www.reddit.com/r/AmItheAsshole/comment... | [edit, update, profile, sil, amy, always, come... | edit update profile sil amy always comes visit... |
| 1 | AITA for bringing up my brother's "premature" ... | I am a nurse practitioner and I am the primary... | 56259 | zvmflw | You can tell the family about the time you wer... | 678 | https://www.reddit.com/r/AmItheAsshole/comment... | [nurse, practitioner, primary, care, provider,... | nurse practitioner primary care provider lot l... |
| 2 | AITA for not taking down my video that was a g... | I have a sister that’s 6 years older than me. ... | 54743 | wyjbjs | NTA\n\nMy parents missed my wedding too all be... | 1578 | https://www.reddit.com/r/AmItheAsshole/comment... | [sister, thats, years, older, parents, years, ... | sister thats years older parents years cancel ... |
| 3 | UPDATE AITA for walking out of the Airport whe... | Hello!.\n\n\nI don't know where to begin...it'... | 51464 | ur2l3s | I'm sorry you are going through this, but I'm ... | 18671 | https://www.reddit.com/r/AmItheAsshole/comment... | [hello, know, beginits, absolute, nightmare, r... | hello know beginits absolute nightmare recentl... |
| 4 | AITA for walking out of the Airport when I saw... | \n\nI F30 don't have the best relationship wit... | 50024 | unhse2 | Definitely NTA. You know that if you had sucke... | 9416 | https://www.reddit.com/r/AmItheAsshole/comment... | [best, relationship, husbands, mom, since, day... | best relationship husbands mom since day one t... |
warnings.filterwarnings("ignore") #ignoring popping up warnings
tokenizer = RegexpTokenizer(r'\w+') # tokenizer
tf_vectorizer = CountVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
max_df = 0.75, #filtering with document frequency
min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
tokenizer = tokenizer.tokenize
)
tf = tf_vectorizer.fit_transform(top_posts["body_clean_str"])
tf_feature_names = tf_vectorizer.get_feature_names()
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
max_df = 0.75, #filtering with document frequency
min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
tokenizer = tokenizer.tokenize
)
tfidf = tfidf_vectorizer.fit_transform(top_posts["body_clean_str"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
def get_umass_score(dt_matrix, i, j):
zo_matrix = (dt_matrix > 0).astype(int)
col_i, col_j = zo_matrix[:, i], zo_matrix[:, j]
col_ij = col_i + col_j
col_ij = (col_ij == 2).astype(int)
Di, Dij = col_i.sum(), col_ij.sum()
return math.log((Dij + 1) / Di)
def get_topic_coherence(dt_matrix, topic, n_top_words):
indexed_topic = zip(topic, range(0, len(topic)))
topic_top = sorted(indexed_topic, key=lambda x: 1 - x[0])[0:n_top_words]
coherence = 0
for j_index in range(0, len(topic_top)):
for i_index in range(0, j_index - 1):
i = topic_top[i_index][1]
j = topic_top[j_index][1]
coherence += get_umass_score(dt_matrix, i, j)
return coherence
def get_average_topic_coherence(dt_matrix, topics, n_top_words):
total_coherence = 0
for i in range(0, len(topics)):
total_coherence += get_topic_coherence(dt_matrix, topics[i], n_top_words)
return total_coherence / len(topics)
measures_specific = []
for n_topics in range(2,51,1):
print('Trying parameters:', n_topics)
lda = LatentDirichletAllocation(n_components = n_topics,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
measures_specific.append([avg_coherence, n_topics])
Trying parameters: 2 Trying parameters: 3 Trying parameters: 4 Trying parameters: 5 Trying parameters: 6 Trying parameters: 7 Trying parameters: 8 Trying parameters: 9 Trying parameters: 10 Trying parameters: 11 Trying parameters: 12 Trying parameters: 13 Trying parameters: 14 Trying parameters: 15 Trying parameters: 16 Trying parameters: 17 Trying parameters: 18 Trying parameters: 19 Trying parameters: 20 Trying parameters: 21 Trying parameters: 22 Trying parameters: 23 Trying parameters: 24 Trying parameters: 25 Trying parameters: 26 Trying parameters: 27 Trying parameters: 28 Trying parameters: 29 Trying parameters: 30 Trying parameters: 31 Trying parameters: 32 Trying parameters: 33 Trying parameters: 34 Trying parameters: 35 Trying parameters: 36 Trying parameters: 37 Trying parameters: 38 Trying parameters: 39 Trying parameters: 40 Trying parameters: 41 Trying parameters: 42 Trying parameters: 43 Trying parameters: 44 Trying parameters: 45 Trying parameters: 46 Trying parameters: 47 Trying parameters: 48 Trying parameters: 49 Trying parameters: 50
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_df_lda = pd.DataFrame(measures_specific).rename(columns={
0: 'avg_coherence', 1: 'n_topics'
})
save_object(measures_specific_df_lda, 'TM_project/measures_specific_df_lda.pkl')
with open("TM_project/measures_specific_df_lda.pkl", "rb") as fp:
measures_specific_df_lda = pickle.load(fp)
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_df_lda['n_topics'],measures_specific_df_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
measures_specific_df_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
| avg_coherence | n_topics | |
|---|---|---|
| 0 | -197.258110 | 2 |
| 6 | -218.464319 | 8 |
| 9 | -223.506711 | 11 |
| 10 | -225.799363 | 12 |
| 1 | -227.682847 | 3 |
| 11 | -230.803568 | 13 |
| 12 | -232.100892 | 14 |
| 7 | -237.269628 | 9 |
| 8 | -239.096575 | 10 |
warnings.filterwarnings("ignore") #ignoring popping up warnings
measures_specific = []
for n_topics in range(2,51,1):
print('Trying parameters:', n_topics)
lda = LatentDirichletAllocation(n_components = n_topics,
learning_method = 'online',
learning_offset = 50.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
avg_coherence = get_average_topic_coherence(tfidf, lda.components_, 25)
measures_specific.append([avg_coherence, n_topics])
Trying parameters: 2 Trying parameters: 3 Trying parameters: 4 Trying parameters: 5 Trying parameters: 6 Trying parameters: 7 Trying parameters: 8 Trying parameters: 9 Trying parameters: 10 Trying parameters: 11 Trying parameters: 12 Trying parameters: 13 Trying parameters: 14 Trying parameters: 15 Trying parameters: 16 Trying parameters: 17 Trying parameters: 18 Trying parameters: 19 Trying parameters: 20 Trying parameters: 21 Trying parameters: 22 Trying parameters: 23 Trying parameters: 24 Trying parameters: 25 Trying parameters: 26 Trying parameters: 27 Trying parameters: 28 Trying parameters: 29 Trying parameters: 30 Trying parameters: 31 Trying parameters: 32 Trying parameters: 33 Trying parameters: 34 Trying parameters: 35 Trying parameters: 36 Trying parameters: 37 Trying parameters: 38 Trying parameters: 39 Trying parameters: 40 Trying parameters: 41 Trying parameters: 42 Trying parameters: 43 Trying parameters: 44 Trying parameters: 45 Trying parameters: 46 Trying parameters: 47 Trying parameters: 48 Trying parameters: 49 Trying parameters: 50
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_tfidf_lda = pd.DataFrame(measures_specific).rename(columns={
0: 'avg_coherence', 1: 'n_topics'
})
save_object(measures_specific_tfidf_lda, 'TM_project/measures_specific_tfidf_lda.pkl')
with open("TM_project/measures_specific_tfidf_lda.pkl", "rb") as fp:
measures_specific_tfidf_lda = pickle.load(fp)
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_tfidf_lda['n_topics'],measures_specific_tfidf_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
measures_specific_tfidf_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
| avg_coherence | n_topics | |
|---|---|---|
| 0 | -235.322070 | 2 |
| 3 | -335.975590 | 5 |
| 1 | -415.732856 | 3 |
| 4 | -419.810625 | 6 |
| 2 | -481.555180 | 4 |
| 5 | -487.373452 | 7 |
| 7 | -501.306549 | 9 |
| 9 | -504.806318 | 11 |
| 12 | -513.255496 | 14 |
# here we consider the previously presented LatentDirichletAllocation() function, still with less parameters
lda = LatentDirichletAllocation(n_components = 11,
learning_method = 'online',
learning_offset = 80.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
max_iter=5, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
max_iter=5, random_state=42)for index, component in enumerate(lda.components_): #taking model's components
#(values from reconstructed Document-Term Matrix)
zipped = zip(tf_feature_names, component) #taking together tokens' names with components
top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0: ['his', 'his mom', 'college', 'school', 'princess', 'asked why said', 'family', 'girls', 'mom', 'going'] Topic 1: ['listen', 'anymore', 'hes', 'big', 'his', 'asking was', 'begging', 'wedding', 'special', 'celebratory dinner'] Topic 2: ['adam', 'work', 'shouting', 'think its', 'this morning', 'dog', 'very much', 'drink', 'fair', 'would like'] Topic 3: ['his', 'family', 'house', 'hold', 'started', 'like', 'get', 'husband', 'mom', 'friend'] Topic 4: ['approached', 'full time', 'year old daughter', 'welcome', 'dead', 'couple days', 'cancelled', 'comment', 'bother', 'son his'] Topic 5: ['his', 'husband', 'im', 'mom', 'sister', 'family', 'like', 'wife', 'get', 'parents'] Topic 6: ['his', 'brother', 'blue', 'years', 'new', 'im', 'dead', 'job', 'husband', 'fund'] Topic 7: ['im', 'sister', 'carrying', 'parents', 'went', 'ive', 'was serious', 'income', 'saturday', 'still'] Topic 8: ['son', 'husband', 'tradition', 'family dinner', 'said was', 'dress', 'always', 'his', 'give', 'heart'] Topic 9: ['want', 'husband', 'daughter', 'made', 'its', 'gone', 'cake', 'wife', 'thats', 'any']
params = []
for alpha in [0.0001, 0.001, 0.01, 0.05, 0.1]:
for beta in [0.0001, 0.001, 0.01, 0.05, 0.1]:
for vectorizer_name in ['tf','tf-idf']:
if(vectorizer_name == 'tf'):
print(alpha, beta, 'tf')
lda = LatentDirichletAllocation(n_components = 11,
doc_topic_prior = alpha,
topic_word_prior = beta,
learning_method = 'online',
learning_offset = 10.0,
max_iter = 5,
random_state = 42)
lda.fit(tf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
params.append([alpha, beta, 'tf', avg_coherence])
if(vectorizer_name == 'tf-idf'):
print(alpha, beta, 'tf-idf')
lda = LatentDirichletAllocation(n_components = 11,
doc_topic_prior = alpha,
topic_word_prior = beta,
learning_method = 'online',
learning_offset = 10.0,
max_iter = 5,
random_state = 42)
lda.fit(tfidf)
avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
params.append([alpha, beta, 'tf-idf', avg_coherence])
0.0001 0.0001 tf 0.0001 0.0001 tf-idf 0.0001 0.001 tf 0.0001 0.001 tf-idf 0.0001 0.01 tf 0.0001 0.01 tf-idf 0.0001 0.05 tf 0.0001 0.05 tf-idf 0.0001 0.1 tf 0.0001 0.1 tf-idf 0.001 0.0001 tf 0.001 0.0001 tf-idf 0.001 0.001 tf 0.001 0.001 tf-idf 0.001 0.01 tf 0.001 0.01 tf-idf 0.001 0.05 tf 0.001 0.05 tf-idf 0.001 0.1 tf 0.001 0.1 tf-idf 0.01 0.0001 tf 0.01 0.0001 tf-idf 0.01 0.001 tf 0.01 0.001 tf-idf 0.01 0.01 tf 0.01 0.01 tf-idf 0.01 0.05 tf 0.01 0.05 tf-idf 0.01 0.1 tf 0.01 0.1 tf-idf 0.05 0.0001 tf 0.05 0.0001 tf-idf 0.05 0.001 tf 0.05 0.001 tf-idf 0.05 0.01 tf 0.05 0.01 tf-idf 0.05 0.05 tf 0.05 0.05 tf-idf 0.05 0.1 tf 0.05 0.1 tf-idf 0.1 0.0001 tf 0.1 0.0001 tf-idf 0.1 0.001 tf 0.1 0.001 tf-idf 0.1 0.01 tf 0.1 0.01 tf-idf 0.1 0.05 tf 0.1 0.05 tf-idf 0.1 0.1 tf 0.1 0.1 tf-idf
# below, we make the output (list) a pandas DataFrame with intuitive colnames
params_df = pd.DataFrame(params).rename(columns={
0: 'alpha', 1: 'beta', 2: 'vectorizer', 3: 'avg_coherence'
})
save_object(params_df, 'TM_project/params_df.pkl')
with open("TM_project/params_df.pkl", "rb") as fp:
params_df = pickle.load(fp)
params_df.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
| alpha | beta | vectorizer | avg_coherence | |
|---|---|---|---|---|
| 38 | 0.0500 | 0.10 | tf | -355.996279 |
| 46 | 0.1000 | 0.05 | tf | -356.302434 |
| 36 | 0.0500 | 0.05 | tf | -358.401506 |
| 28 | 0.0100 | 0.10 | tf | -358.964835 |
| 18 | 0.0010 | 0.10 | tf | -359.791277 |
| 8 | 0.0001 | 0.10 | tf | -359.829972 |
| 4 | 0.0001 | 0.01 | tf | -366.320263 |
| 14 | 0.0010 | 0.01 | tf | -366.845765 |
| 44 | 0.1000 | 0.01 | tf | -369.176138 |
fig = px.scatter(params_df[params_df['vectorizer']=='tf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
fig = px.scatter(params_df[params_df['vectorizer']=='tf-idf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
lda = LatentDirichletAllocation(n_components = 11, # let us stay with 11, as that is what topic coherence initially recommended
doc_topic_prior = 0.0500,
topic_word_prior = 0.10,
learning_method = 'online',
learning_offset = 10.0,
max_iter = 20,
random_state = 42)
lda.fit(tf) # TF for now
topics_lists = []
for index, component in enumerate(lda.components_): #taking model's components
#(values from reconstructed Document-Term Matrix)
zipped = zip(tf_feature_names, component) #taking together tokens' names with components
top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
topics_lists.append(top_terms_list)
print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0: ['his', 'im', 'because', 'like', 'husband', 'got', 'get', 'time', 'its', 'has'] Topic 1: ['gold', 'digger', 'gold digger', 'bf', 'hold', 'high', 'route', 'suppose', 'correctly', 'seconds'] Topic 2: ['im', 'his', 'because', 'like', 'get', 'its', 'one', 'got', 'even', 'before'] Topic 3: ['dress', 'wedding', 'wear', 'makeup', 'wearing', 'bride', 'wedding dress', 'dresses', 'color', 'bridesmaids'] Topic 4: ['his', 'daughter', 'daughter asked', 'friend', 'got', 'because', 'since', 'could', 'asked', 'land'] Topic 5: ['seat', 'seats', 'tattoo', 'flight', 'plane', 'his seat', 'attendant', 'exactly', 'next', 'brother law'] Topic 6: ['his', 'his mom', 'mom', 'got', 'husband', 'home', 'us', 'get', 'like', 'one'] Topic 7: ['amy', 'ava', 'lisa', 'watch', 'italian', 'bill', 'sex', 'grandfather', 'profile', 'family'] Topic 8: ['his', 'family', 'because', 'mom', 'would', 'wife', 'got', 'has', 'dad', 'like'] Topic 9: ['hair', 'jake', 'police', 'black', 'back', 'went', 'asked', 'name', 'office', 'coworkers'] Topic 10: ['his', 'snooping', 'husband', 'his mom', 'asked', 'got', 'im', 'bedroom', 'since', 'mom']
import os
import openai
from IPython.display import Image
from IPython import display
from base64 import b64decode
openai.api_key = "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"
images = []
for i in range(len(topics_lists)):
try:
topic_prompt = " ".join(topics_lists[i])
response = openai.Image.create(
prompt=topic_prompt,
n=1,
size="512x512",
response_format="b64_json"
)
images.append((i,response['data'][0]['b64_json']))
print(i)
except:
images.append((i, np.nan))
print(i)
print("too NSFW for OpenAI")
0 1 2 3 4 5 6 7 too NSFW for OpenAI 8 9 10
image_df = pd.DataFrame(images, columns =['topic', 'image'])
image_df["words"] = topics_lists
image_df.head()
| topic | image | words | |
|---|---|---|---|
| 0 | 0 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [his, im, because, like, husband, got, get, ti... |
| 1 | 1 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [gold, digger, gold digger, bf, hold, high, ro... |
| 2 | 2 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [im, his, because, like, get, its, one, got, e... |
| 3 | 3 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [dress, wedding, wear, makeup, wearing, bride,... |
| 4 | 4 | iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... | [his, daughter, daughter asked, friend, got, b... |
len(image_df)
11
for i in range(len(image_df)):
print("Image for topic ", i, " with words:")
print(image_df.loc[i,"words"])
try:
display.display(display.Image(b64decode(image_df.loc[i,"image"])))
except:
print("Too NSFW for OpenAI")
Image for topic 0 with words: ['his', 'im', 'because', 'like', 'husband', 'got', 'get', 'time', 'its', 'has']
Image for topic 1 with words: ['gold', 'digger', 'gold digger', 'bf', 'hold', 'high', 'route', 'suppose', 'correctly', 'seconds']
Image for topic 2 with words: ['im', 'his', 'because', 'like', 'get', 'its', 'one', 'got', 'even', 'before']
Image for topic 3 with words: ['dress', 'wedding', 'wear', 'makeup', 'wearing', 'bride', 'wedding dress', 'dresses', 'color', 'bridesmaids']
Image for topic 4 with words: ['his', 'daughter', 'daughter asked', 'friend', 'got', 'because', 'since', 'could', 'asked', 'land']
Image for topic 5 with words: ['seat', 'seats', 'tattoo', 'flight', 'plane', 'his seat', 'attendant', 'exactly', 'next', 'brother law']
Image for topic 6 with words: ['his', 'his mom', 'mom', 'got', 'husband', 'home', 'us', 'get', 'like', 'one']
Image for topic 7 with words: ['amy', 'ava', 'lisa', 'watch', 'italian', 'bill', 'sex', 'grandfather', 'profile', 'family'] Too NSFW for OpenAI Image for topic 8 with words: ['his', 'family', 'because', 'mom', 'would', 'wife', 'got', 'has', 'dad', 'like']
Image for topic 9 with words: ['hair', 'jake', 'police', 'black', 'back', 'went', 'asked', 'name', 'office', 'coworkers']
Image for topic 10 with words: ['his', 'snooping', 'husband', 'his mom', 'asked', 'got', 'im', 'bedroom', 'since', 'mom']
df_topics_for_posts = pd.DataFrame(lda.transform(tf).tolist())
df_topics_for_posts.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.902614 | 0.000288 | 0.000288 | 0.000288 | 0.000288 | 0.000288 | 0.000288 | 0.052027 | 0.032663 | 0.010679 | 0.000288 |
| 1 | 0.000188 | 0.000188 | 0.000188 | 0.000188 | 0.000188 | 0.000188 | 0.000188 | 0.290750 | 0.707562 | 0.000188 | 0.000188 |
| 2 | 0.105910 | 0.000203 | 0.000203 | 0.000203 | 0.000203 | 0.000203 | 0.000203 | 0.000203 | 0.800131 | 0.092337 | 0.000203 |
| 3 | 0.821660 | 0.000164 | 0.000164 | 0.000164 | 0.000164 | 0.000164 | 0.000164 | 0.000164 | 0.176868 | 0.000164 | 0.000164 |
| 4 | 0.942130 | 0.000163 | 0.000163 | 0.000163 | 0.000163 | 0.000163 | 0.000163 | 0.000163 | 0.056407 | 0.000163 | 0.000163 |
top_posts_final = pd.merge(top_posts, round(df_topics_for_posts*100, 3), left_index=True, right_index=True)
top_posts_final.head()
| title | body | score | id | top_comment_body | top_comment_score | url | body_clean | body_clean_str | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AITA for bringing my SIL’s wallet to the resta... | Edit: update on profile\n\nMy (f28) SIL “Amy” ... | 68512 | x2k5kv | NTA. Stone cold busted. Next time she books an... | 1442 | https://www.reddit.com/r/AmItheAsshole/comment... | [edit, update, profile, sil, amy, always, come... | edit update profile sil amy always comes visit... | 90.261 | 0.029 | 0.029 | 0.029 | 0.029 | 0.029 | 0.029 | 5.203 | 3.266 | 1.068 | 0.029 |
| 1 | AITA for bringing up my brother's "premature" ... | I am a nurse practitioner and I am the primary... | 56259 | zvmflw | You can tell the family about the time you wer... | 678 | https://www.reddit.com/r/AmItheAsshole/comment... | [nurse, practitioner, primary, care, provider,... | nurse practitioner primary care provider lot l... | 0.019 | 0.019 | 0.019 | 0.019 | 0.019 | 0.019 | 0.019 | 29.075 | 70.756 | 0.019 | 0.019 |
| 2 | AITA for not taking down my video that was a g... | I have a sister that’s 6 years older than me. ... | 54743 | wyjbjs | NTA\n\nMy parents missed my wedding too all be... | 1578 | https://www.reddit.com/r/AmItheAsshole/comment... | [sister, thats, years, older, parents, years, ... | sister thats years older parents years cancel ... | 10.591 | 0.020 | 0.020 | 0.020 | 0.020 | 0.020 | 0.020 | 0.020 | 80.013 | 9.234 | 0.020 |
| 3 | UPDATE AITA for walking out of the Airport whe... | Hello!.\n\n\nI don't know where to begin...it'... | 51464 | ur2l3s | I'm sorry you are going through this, but I'm ... | 18671 | https://www.reddit.com/r/AmItheAsshole/comment... | [hello, know, beginits, absolute, nightmare, r... | hello know beginits absolute nightmare recentl... | 82.166 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 17.687 | 0.016 | 0.016 |
| 4 | AITA for walking out of the Airport when I saw... | \n\nI F30 don't have the best relationship wit... | 50024 | unhse2 | Definitely NTA. You know that if you had sucke... | 9416 | https://www.reddit.com/r/AmItheAsshole/comment... | [best, relationship, husbands, mom, since, day... | best relationship husbands mom since day one t... | 94.213 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 0.016 | 5.641 | 0.016 | 0.016 |
save_object(top_posts_final, 'TM_project/final_df.pkl')
os.system('jupyter nbconvert --to html TM_project/Code_for_LDA.ipynb')
[NbConvertApp] Converting notebook TM_project/Code_for_LDA.ipynb to html [NbConvertApp] Writing 10251414 bytes to TM_project/Code_for_LDA.html
0